Assignment 3

© 2023, Scripnic Dinu, all rights reserved

## 3.1 Mobile phone picture

# Set the working directory to the folder containing this script so that the
# relative data paths used below ("./img.jpg", "./drilling.csv") resolve.
# Only attempted inside RStudio, and only when the active document reports a
# non-empty path: for an unsaved document the path is "", and setwd("") fails.
if (rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  }
}
# load libraries
library(data.table)
library(magick)
## Warning: package 'magick' was built under R version 4.2.3
## Linking to ImageMagick 6.9.12.3
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11
library(cluster) 
library(imager)
## Warning: package 'imager' was built under R version 4.2.3
## Loading required package: magrittr
## 
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
## 
##     add
## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum
## The following object is masked from 'package:graphics':
## 
##     frame
## The following object is masked from 'package:base':
## 
##     save.image
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:imager':
## 
##     highlight
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.2.3
## 
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(FNN)
## Warning: package 'FNN' was built under R version 4.2.3

3.1 Mobile phone picture

3.1.1

  • Take a picture with your smartphone. The picture must contain a piece of paper with your name on it and some type of background (walls, floor, window, etc.).
  • Resize it to a manageable size (e.g., 256x256) with either R or Python.
# Read the photo and shrink it to 256 x 256 pixels in one pipeline.
img <- imager::load.image("./img.jpg") |>
  imager::resize(size_x = 256, size_y = 256)
plot(img)

# One row per pixel; wide = "c" spreads the colour channels into columns
# (referenced further down as c.1, c.2, c.3) next to the x / y coordinates.
pixels <- setDT(as.data.frame(img, wide = "c"))

3.1.2

  • Reduce the number of RGB colors in the image by using the k-means algorithm.
# Elbow analysis for the colour quantisation.
# Cluster on the three colour channels only: `pixels` also carries the x / y
# pixel coordinates, and feeding those to k-means would group pixels by
# position instead of by colour.
rgb_cols <- c("c.1", "c.2", "c.3")
color_values <- as.matrix(pixels[, rgb_cols, with = FALSE])
k_candidates <- 1:10
# k-means starts from randomly chosen centres; fix the seed so the curve is
# the same on every run.
set.seed(42)
wss <- vapply(
  k_candidates,
  function(k) kmeans(color_values, centers = k)$tot.withinss,
  numeric(1)
)
plot(k_candidates, wss, type = "b", xlab = "Number of clusters", ylab = "Within-cluster sum of squares")

### Conclusion

* The optimal number of clusters is 4

# Quantise the image to `best_k` colours and redraw it.
best_k <- 4
# Cluster on the colour channels only; the x / y columns of `pixels` are
# positions, not colours, and must not influence the palette.
rgb_cols <- c("c.1", "c.2", "c.3")
# k-means starts from randomly chosen centres; fix the seed for repeatability.
set.seed(42)
km <- kmeans(as.matrix(pixels[, rgb_cols, with = FALSE]), centers = best_k)
# Replace every pixel's colour by the centre of the cluster it belongs to.
palette_rgb <- km$centers[km$cluster, , drop = FALSE]
dt_newimg <- data.table(
  x = pixels[, x],
  y = pixels[, y],
  R = palette_rgb[, "c.1"],
  G = palette_rgb[, "c.2"],
  B = palette_rgb[, "c.3"])
# Draw one marker per pixel. The y axis is reversed so that row 1 is at the
# top, and anchored to x so pixels stay square.
plot_ly(data = dt_newimg,
        x = ~x,
        y = ~y,
        type = "scattergl",
        mode = "markers",
        marker = list(color = ~rgb(R, G, B))) |>
  plotly::layout(yaxis = list(autorange = "reversed", scaleanchor = "x", scaleratio = 1))
# fviz_cluster(km, data = dt_rgb) # if there are more than 2 dim, it uses PCA

3.2 Drilling data

3.2.1 Load the data

# Read the drilling measurements and print a per-column summary.
data <- read.csv(file = "./drilling.csv")
summary(object = data)
##        x                  y           
##  Min.   :-0.04969   Min.   :-0.05967  
##  1st Qu.: 0.19177   1st Qu.: 0.18619  
##  Median : 0.40684   Median : 0.40299  
##  Mean   : 0.41582   Mean   : 0.45929  
##  3rd Qu.: 0.65068   3rd Qu.: 0.73264  
##  Max.   : 0.88515   Max.   : 1.06394
# Scatterplot of the raw points
plot(x = data$x, y = data$y, type = "p")

# Clustering input: only the x and y coordinate columns
feedData <- data[c("x", "y")]

3.2.2 K-means clustering

# Elbow analysis: total within-cluster sum of squares for k = 1..10.
k_candidates <- 1:10
# k-means starts from randomly chosen centres. Fix the seed and keep the best
# of several starts so the curve is repeatable and not distorted by one
# unlucky initialisation.
set.seed(42)
wss <- vapply(
  k_candidates,
  function(k) kmeans(feedData, centers = k, nstart = 10)$tot.withinss,
  numeric(1)
)
plot(k_candidates, wss, type = "b", xlab = "Number of clusters", ylab = "Within-cluster sum of squares")

### Conclusion

* The optimal number of clusters is 4

# Fit the final k-means model with the number of clusters read off the elbow.
best_k <- 4
# Seed plus several random starts: without them the cluster numbering (and
# therefore the plot colours) changes between runs.
set.seed(42)
kmeans_model <- kmeans(feedData, centers = best_k, nstart = 10)
# add the cluster to the data
data$kmeans_cluster <- kmeans_model$cluster
# Colour each point by its cluster id (1..best_k)
plot(data$x, data$y, type="p", col=data$kmeans_cluster)

3.2.3 Hierarchical clustering

# Agglomerative clustering (cluster::agnes) once per linkage method,
# collected in a list keyed by the method name.
linkage_methods <- c("average", "single", "complete", "ward")
hcl <- lapply(
  setNames(linkage_methods, linkage_methods),
  function(linkage) agnes(feedData, method = linkage)
)

# Dendrogram (which.plots = 2) for every linkage, in the same order.
dendrogram_titles <- c(
  average = "Average linkage",
  single = "Single linkage",
  complete = "Complete linkage",
  ward = "Ward linkage"
)
for (linkage in names(dendrogram_titles)) {
  plot(hcl[[linkage]], which.plots = 2, main = dendrogram_titles[[linkage]])
}

  • We can see that the best linkage method is ward
# Refit with the chosen linkage and cut the tree into 4 clusters.
best_linkage <- "ward"
hcl_model <- agnes(feedData, method = best_linkage)
# Plot the model that is actually cut below, so the dendrogram, the reference
# line and the cluster assignment all refer to the same object.
plot(hcl_model, which.plots = 2, main = "Ward linkage")

abline(h = 1.5, col = "red", lwd = 3)

# we can cut at 1.5 to have 4 clusters
# cutree() is documented for hclust trees, so convert the agnes object first;
# unname() keeps the new column a plain integer vector.
tree <- unname(cutree(as.hclust(hcl_model), k = 4))
data$hcl_cluster <- tree
plot(data$x, data$y, type="p", col=data$hcl_cluster)

3.2.4 DBSCAN

  • First we have to find the best minPts and eps
  • We use minPts = 4, following the common rule of thumb minPts = 2 × (number of dimensions), and our data has 2 dimensions
  • To find the best eps we have to:
    1. Get the distance using KNN
    2. Plot the distances in an ascending order
    3. Identify the value where the curve has the biggest change
# load the library that provides knn.dist()
library(FNN)
minPts <- 4
# 1. Get the distance using KNN
#    knn.dist() returns one column per neighbour rank (1st .. minPts-th).
#    The k-distance plot needs only the distance to the minPts-th neighbour,
#    i.e. the last column; sorting the whole matrix would mix all neighbour
#    ranks into a single curve.
knn_dist <- knn.dist(feedData, k = minPts)[, minPts]
# 2. Sort the distances in an ascending order
knn_dist <- sort(knn_dist, decreasing = FALSE)
# 3. Plot the distances
plot(knn_dist)
# draw a line at eps = 0.06, very thick
abline(h = 0.06, col = "red", lwd = 3)

# 4. Identify the value where the curve has the biggest change
eps <- 0.06
  • Now we can run the DBSCAN algorithm
# Run DBSCAN with the eps / minPts chosen above and plot the result.
library(dbscan)
dbscan_model <- dbscan(feedData, eps = eps, minPts = minPts)
data$dbscan_cluster <- dbscan_model$cluster
# DBSCAN labels noise points with cluster id 0, and colour index 0 is the
# background colour in base graphics, which would hide them. Shift the ids by
# one so noise is drawn in colour 1 and the clusters in colours 2, 3, ...
plot(data$x, data$y, type="p", col=data$dbscan_cluster + 1L)